{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "22f037ad-d2ec-485a-9798-9ac8bf350480",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "# Anscombe's Quartet\n",
    "\n",
    "The raw data has four series. We'll ingest the data in several forms\n",
    "\n",
    "## Raw Data for the Series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fe69e130-bb5d-434d-8b46-8fb5db087874",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "from pydantic import BaseModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "04659851-367b-452c-8edf-571dc1872a1f",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['I', 'II', 'III', 'IV']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "source_path = Path.cwd().parent.parent / \"data\" / \"anscombe.json\"\n",
    "with source_path.open() as source_file:\n",
    "    all_data = json.load(source_file)\n",
    "[data['series'] for data in all_data]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b807c8ec-b1ad-45f9-b694-d8ed4481b762",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'x': 10.0, 'y': 8.04},\n",
       " {'x': 8.0, 'y': 6.95},\n",
       " {'x': 13.0, 'y': 7.58},\n",
       " {'x': 9.0, 'y': 8.81},\n",
       " {'x': 11.0, 'y': 8.33},\n",
       " {'x': 14.0, 'y': 9.96},\n",
       " {'x': 6.0, 'y': 7.24},\n",
       " {'x': 4.0, 'y': 4.26},\n",
       " {'x': 12.0, 'y': 10.84},\n",
       " {'x': 7.0, 'y': 4.82},\n",
       " {'x': 5.0, 'y': 5.68}]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_data[0]['data']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2543b913-5eeb-453b-bfed-7f9ea7922eca",
   "metadata": {},
   "source": [
    "## Pydantic Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8bf64032-1120-4e94-a08b-f9902b513f1f",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from pydantic import BaseModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "99a55c73-67ad-4ce0-9faf-7921a179e5f2",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "class Pair(BaseModel):\n",
    "    x: float\n",
    "    y: float\n",
    "\n",
    "class Series(BaseModel):\n",
    "    series: str\n",
    "    data: list[Pair]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9f5030f1-f7bb-4254-a829-86cf198eed9c",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "clean_data = [Series.model_validate(d) for d in all_data]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "df4a3189-55df-42f0-ad86-0b61e41cedba",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Series(series='I', data=[Pair(x=10.0, y=8.04), Pair(x=8.0, y=6.95), Pair(x=13.0, y=7.58), Pair(x=9.0, y=8.81), Pair(x=11.0, y=8.33), Pair(x=14.0, y=9.96), Pair(x=6.0, y=7.24), Pair(x=4.0, y=4.26), Pair(x=12.0, y=10.84), Pair(x=7.0, y=4.82), Pair(x=5.0, y=5.68)])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clean_data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "13108efb-7c10-4250-8c24-4bff9d53f66c",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Series(series='II', data=[Pair(x=10.0, y=9.14), Pair(x=8.0, y=8.14), Pair(x=13.0, y=8.74), Pair(x=9.0, y=8.77), Pair(x=11.0, y=9.26), Pair(x=14.0, y=8.1), Pair(x=6.0, y=6.13), Pair(x=4.0, y=3.1), Pair(x=12.0, y=9.13), Pair(x=7.0, y=7.26), Pair(x=5.0, y=4.74)])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clean_data[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "307400ff-8ac1-4a6f-a3a4-ba270726fd18",
   "metadata": {},
   "outputs": [],
   "source": [
    "quartet = {s.series: s for s in clean_data}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "611d4f33-6208-472d-bdc5-613d315bf211",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Series(series='I', data=[Pair(x=10.0, y=8.04), Pair(x=8.0, y=6.95), Pair(x=13.0, y=7.58), Pair(x=9.0, y=8.81), Pair(x=11.0, y=8.33), Pair(x=14.0, y=9.96), Pair(x=6.0, y=7.24), Pair(x=4.0, y=4.26), Pair(x=12.0, y=10.84), Pair(x=7.0, y=4.82), Pair(x=5.0, y=5.68)])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "quartet['I']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "226e0903-fe15-4f7a-bc0f-50b149ed468f",
   "metadata": {},
   "outputs": [],
   "source": [
    "assert len(quartet) == 4, f\"read {len(quartet)} series\"\n",
    "assert list(quartet.keys()) == [\"I\", \"II\", \"III\", \"IV\"], f\"keys were {list(quartet.keys())}\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "443e9647-e26b-4436-a14e-995eee644e9f",
   "metadata": {},
   "source": [
    "## Revised Design\n",
    "\n",
    "This is a more focused data loading cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ae4c49b-8722-4e7a-bb3f-d9697c39999b",
   "metadata": {},
   "outputs": [],
   "source": [
    "source_path = Path.cwd().parent.parent / \"data\" / \"anscombe.json\"\n",
    "with source_path.open() as source_file:\n",
    "    json_document = json.load(source_file)\n",
    "    source_data = (Series.model_validate(s) for s in json_document)\n",
    "    quartet = {s.series: s for s in source_data}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d34e1d6a-64a4-4988-b677-d6ba9c33775b",
   "metadata": {},
   "outputs": [],
   "source": [
    "quartet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f93f612-b9c8-4853-9d48-b93dac63bfc3",
   "metadata": {},
   "outputs": [],
   "source": [
    "for name in quartet:\n",
    "    print(f\"{name:3s} {len(quartet[name].data):d}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d21fc7b7-6092-4689-b1e5-4f297860c584",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
